{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "toc": true
   },
   "source": [
    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
    "<div class=\"toc\" style=\"margin-top: 1em;\"><ul class=\"toc-item\"><li><span><a href=\"#Load-and-Prepare-Text8-data\" data-toc-modified-id=\"Load-and-Prepare-Text8-data-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Load and Prepare Text8 data</a></span></li></ul></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RNN in Keras for Text Data (NLP) <a class=\"tocSkip\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NumPy:1.13.1\n",
      "Matplotlib:2.1.0\n",
      "TensorFlow:1.4.1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Keras:2.0.9\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "np.random.seed(123)\n",
    "print(\"NumPy:{}\".format(np.__version__))\n",
    "\n",
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.pylab import rcParams\n",
    "rcParams['figure.figsize']=15,10\n",
    "print(\"Matplotlib:{}\".format(mpl.__version__))\n",
    "\n",
    "import tensorflow as tf\n",
    "tf.set_random_seed(123)\n",
    "print(\"TensorFlow:{}\".format(tf.__version__))\n",
    "\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, LSTM, Activation\n",
    "from keras.losses import mean_squared_error as k_mse\n",
    "from keras.backend import sqrt as k_sqrt\n",
    "import keras.backend as K\n",
    "import keras\n",
    "print(\"Keras:{}\".format(keras.__version__))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATASETSLIB_HOME = os.path.join(os.path.expanduser('~'),'dl-ts','datasetslib')\n",
    "import sys\n",
    "if not DATASETSLIB_HOME in sys.path:\n",
    "    sys.path.append(DATASETSLIB_HOME)\n",
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "import datasetslib\n",
    "\n",
    "from datasetslib import util as dsu\n",
    "datasetslib.datasets_root = os.path.join(os.path.expanduser('~'),'datasets')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text Generation with Text8 Data in Keras"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and Prepare Text8 data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Already exists: /home/armando/datasets/text8/text8.zip\n",
      "Train: [  8 497   7   5 116]\n",
      "Vocabulary Length =  1457\n"
     ]
    }
   ],
   "source": [
    "from datasetslib.text8 import Text8\n",
    "text8 = Text8()\n",
    "text8.load_data(clip_at=5000) # downloads data, converts words to ids, converts files to a list of ids\n",
    "print('Train:', text8.part['train'][0:5])\n",
    "#print(text8.part['test'][0:5])\n",
    "#print(text8.part['valid'][0:5])\n",
    "print('Vocabulary Length = ',text8.vocab_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing\n"
     ]
    }
   ],
   "source": [
    "def id2string(ids):\n",
    "    return ' '.join([text8.id2word[x_i] for x_i in ids])\n",
    "print(id2string(text8.part['train'][0:100]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Random 5 words:  free bolshevik be n another\n",
      "First 5 words:  anarchism originated as a term\n"
     ]
    }
   ],
   "source": [
    "# parameters\n",
    "\n",
    "n_x = 5 # number of input words\n",
    "n_y = 1 # number of output words\n",
    "n_x_vars = 1 # in case of our text, there is only 1 variable at each timestep\n",
    "n_y_vars = text8.vocab_len\n",
    "\n",
    "random5 = np.random.choice(n_x * 50, n_x, replace=False)\n",
    "print('Random 5 words: ',id2string(random5))\n",
    "first5 = text8.part['train'][0:n_x].copy()\n",
    "print('First 5 words: ',id2string(first5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reset the jupyter buffers\n",
    "tf.reset_default_graph()\n",
    "keras.backend.clear_session()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "lstm_1 (LSTM)                (None, 128)               66560     \n",
      "_________________________________________________________________\n",
      "dense_1 (Dense)              (None, 1457)              187953    \n",
      "_________________________________________________________________\n",
      "activation_1 (Activation)    (None, 1457)              0         \n",
      "=================================================================\n",
      "Total params: 254,513\n",
      "Trainable params: 254,513\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "Random 5 words:  free bolshevik be n another\n",
      "First 5 words:  anarchism originated as a term\n",
      "\n",
      "Let's train and predict now:\n",
      "\n",
      "Epoch:  99\n",
      "  Random5 prediction: anarchistic anarchistic wrote wrote wrote wrote wrote wrote wrote wrote\n",
      "  First5 prediction: self nature nature war than than than than than than\n",
      "Epoch:  199\n",
      "  Random5 prediction: anarchistic anarchistic wrote wrote wrote wrote wrote wrote wrote wrote\n",
      "  First5 prediction: self i nature french french french french french french french\n",
      "Epoch:  299\n",
      "  Random5 prediction: anarchistic anarchistic wrote wrote wrote wrote wrote wrote wrote wrote\n",
      "  First5 prediction: term i revolutionary revolutionary french french french french french french\n",
      "Epoch:  399\n",
      "  Random5 prediction: anarchistic anarchistic amongst wrote wrote wrote wrote wrote wrote wrote\n",
      "  First5 prediction: term i revolutionary revolutionary french french french french french french\n",
      "Epoch:  499\n",
      "  Random5 prediction: tolstoy anarchistic amongst wrote wrote wrote wrote wrote wrote wrote\n",
      "  First5 prediction: term i revolutionary revolutionary french french french french french french\n",
      "Epoch:  599\n",
      "  Random5 prediction: tolstoy anarchistic true wrote wrote wrote wrote wrote wrote wrote\n",
      "  First5 prediction: term i revolutionary revolutionary french french french french french french\n",
      "Epoch:  699\n",
      "  Random5 prediction: tolstoy anarchistic true tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy\n",
      "  First5 prediction: term i revolutionary revolutionary had french french french french french\n",
      "Epoch:  799\n",
      "  Random5 prediction: tolstoy anarchistic tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy\n",
      "  First5 prediction: term i revolutionary revolutionary french french french french french french\n",
      "Epoch:  899\n",
      "  Random5 prediction: tolstoy anarchistic tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy\n",
      "  First5 prediction: term i revolutionary revolutionary had french french french french french\n",
      "Epoch:  999\n",
      "  Random5 prediction: tolstoy anarchistic tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy\n",
      "  First5 prediction: term i revolutionary revolutionary had french french french french french\n"
     ]
    }
   ],
   "source": [
    "# get the data\n",
    "x_train, y_train = text8.seq_to_xy(seq=text8.part['train'],n_tx=n_x,n_ty=n_y)\n",
    "\n",
    "# reshape input to be [samples, time steps, features]\n",
    "x_train = x_train.reshape(x_train.shape[0], x_train.shape[1],1)\n",
    "#x_test = x_test.reshape(x_test.shape[0], X_train.shape[1], 1)\n",
    "y_onehot = np.zeros(shape=[y_train.shape[0],text8.vocab_len],dtype=np.float32)\n",
    "for i in range(y_train.shape[0]):\n",
    "    y_onehot[i,y_train[i]]=1\n",
    "    \n",
    "\n",
    "# parameters\n",
    "n_epochs = 1000\n",
    "batch_size=128\n",
    "state_size=128\n",
    "n_epochs_display=100\n",
    "            \n",
    "# create and fit the LSTM model\n",
    "model = Sequential()\n",
    "model.add(LSTM(units=state_size, \n",
    "               input_shape=(x_train.shape[1], x_train.shape[2]),\n",
    "               return_sequences=False\n",
    "              )\n",
    "         )\n",
    "model.add(Dense(text8.vocab_len))\n",
    "model.add(Activation('softmax'))\n",
    "\n",
    "model.compile(loss='categorical_crossentropy', optimizer='adam')\n",
    "model.summary()\n",
    "\n",
    "#random5 = np.random.choice(n_x * 100, n_x, replace=False)\n",
    "print('Random 5 words: ',id2string(random5))\n",
    "#first5 = text8.part['train'][0:n_x].copy()\n",
    "print('First 5 words: ',id2string(first5))\n",
    "\n",
    "print('\\nLet\\'s train and predict now:\\n')\n",
    "for j in range(n_epochs // n_epochs_display):\n",
    "    model.fit(x_train, y_onehot, epochs=n_epochs_display, batch_size=batch_size,verbose=0)\n",
    "\n",
    "    # generate text\n",
    "    y_pred_r5 = np.empty([10])\n",
    "    y_pred_f5 = np.empty([10])\n",
    "\n",
    "    x_test_r5 = random5.copy()\n",
    "    x_test_f5 = first5.copy()\n",
    "    # let us generate text of 10 words after feeding 5 words\n",
    "    for i in range(10):\n",
    "        for x,y in zip([x_test_r5,x_test_f5],[y_pred_r5,y_pred_f5]):\n",
    "            x_input = x.copy()\n",
    "            x_input = x_input.reshape(-1, n_x, n_x_vars)\n",
    "            y_pred = model.predict(x_input)[0]\n",
    "            y_pred_id = np.argmax(y_pred)\n",
    "            y[i]=y_pred_id\n",
    "            x[:-1] = x[1:]\n",
    "            x[-1] = y_pred_id\n",
    "    print('Epoch: ',((j+1) * n_epochs_display)-1)\n",
    "    print('  Random5 prediction:',id2string(y_pred_r5))\n",
    "    print('  First5 prediction:',id2string(y_pred_f5))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "toc_cell": true,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
